Load the necessary libraries
library(mgcv) #for GAMs
library(gratia) #for GAM plots
library(emmeans) #for marginal means etc
library(broom) #for tidy output
library(MuMIn) #for model selection and AICc
library(lubridate) #for processing dates
library(tidyverse) #for data wrangling
library(DHARMa) #for residuals diagnostics
library(performance) #for residual disagnostics
library(see) # to visualize residual diagnostics
library(patchwork) #for grids of plots
The Australian Institute of Marine Science (AIMS) have a long-term inshore marine water quality monitoring program in which water samples are collected and analysed from sites (reef.alias) across the GBR numerous times per year. The focus of this program is to report long-term condition and change in water quality parameters.
Although we do have latitude and longitudes, the nature of the spatial design predominantly reflects a series of transects that start near the mouth of a major river and extend northwards, yet mainly within the open coastal zone. As a result, this design is not well suited to any specific spatial analyses (since they are mainly one dimensional).
AIMS water quality monitoring
Format of aims.wq.csv data file
| LATITUDE | LONGITUDE | reef.alias | Water_Samples | Region | Subregion | Season | waterYear | NOx |
|---|---|---|---|---|---|---|---|---|
| -16.1 | 145. | Cape Trib… | AIMS | Wet T… | Barron D… | Dry | 2008 | 0.830 |
| -16.1 | 145. | Cape Trib… | AIMS | Wet T… | Barron D… | Wet | 2008 | 0.100 |
| -16.1 | 145. | Cape Trib… | AIMS | Wet T… | Barron D… | Dry | 2009 | 0.282 |
| -16.1 | 145. | Cape Trib… | AIMS | Wet T… | Barron D… | Wet | 2009 | 1.27 |
| -16.1 | 145. | Cape Trib… | AIMS | Wet T… | Barron D… | Dry | 2009 | 0.793 |
| -16.1 | 145. | Cape Trib… | AIMS | Wet T… | Barron D… | Dry | 2010 | 0.380 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| LATITUDE | - Latitudinal coordinate |
| LONGITUDE | - Longitudinal coordinate |
| reef.alias | - Internal AIMS reef name |
| Water_Samples | - Categorical label of who collected the data |
| Region | - The MMP region |
| Subregion | - The MMP subregion |
| Season | - A categorical listing of Wet or Dry |
| waterYear | - The water year (1st Oct - 30 Sept) to which the data are attached |
| Date | - The date the sample was collected |
| NOx | - Nitrite and Nitrate |
wq = read_csv('../public/data/aims.wq.csv', trim_ws=TRUE)
glimpse(wq)
## Rows: 618
## Columns: 57
## $ LATITUDE <dbl> -16.11152, -16.11383, -16.11261, -16.11308, -16.11343…
## $ LONGITUDE <dbl> 145.4833, 145.4827, 145.4844, 145.4868, 145.4835, 145…
## $ reef.alias <chr> "Cape Tribulation", "Cape Tribulation", "Cape Tribula…
## $ Water_Samples <chr> "AIMS", "AIMS", "AIMS", "AIMS", "AIMS", "AIMS", "AIMS…
## $ Region <chr> "Wet Tropics", "Wet Tropics", "Wet Tropics", "Wet Tro…
## $ Subregion <chr> "Barron Daintree", "Barron Daintree", "Barron Daintre…
## $ Season <chr> "Dry", "Wet", "Dry", "Wet", "Dry", "Dry", "Wet", "Dry…
## $ waterYear <dbl> 2008, 2008, 2009, 2009, 2009, 2010, 2010, 2010, 2011,…
## $ Date <date> 2007-10-14, 2008-03-30, 2008-10-12, 2009-02-26, 2009…
## $ Mnth <dbl> 10, 3, 10, 2, 6, 10, 3, 6, 10, 2, 6, 10, 2, 6, 10, 2,…
## $ Dt.num...11 <dbl> 2007.784, 2008.243, 2008.779, 2009.153, 2009.452, 200…
## $ Source <chr> "AIMS Niskin", "AIMS Niskin", "AIMS Niskin", "AIMS Ni…
## $ CDOM_443 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ CHL_TSS <dbl> 0.3101798, 0.1077403, 0.1927524, 0.4841206, 0.2699770…
## $ DIN <dbl> NA, 1.7603107, 1.8912551, 1.5700109, 0.9340067, 0.853…
## $ DIP <dbl> 3.2430236, 0.0000000, 8.4301889, 0.2818657, 3.3818466…
## $ DOC_DON <dbl> 9.894332, 18.838502, 10.247938, 8.853276, 35.173595, …
## $ DOC_DOP <dbl> 249.57930, 164.16894, 232.26052, 138.28572, 143.74990…
## $ DOC <dbl> 568.7456, 800.0811, 665.5129, 790.8947, 709.3114, 703…
## $ DON_DOP <dbl> 25.371405, 8.672342, 21.385520, 15.773521, 4.218696, …
## $ DON <dbl> 58.39177, 42.59066, 66.56530, 89.86323, 21.04308, 65.…
## $ DOP <dbl> 2.360146, 5.676724, -3.369768, 6.108445, 5.245931, 2.…
## $ DRIFTCHL_UGPERL <dbl> 0.2348550, 0.3407700, 0.3229300, 0.3026219, 0.3904875…
## $ DRIFTPHAE_UGPERL <dbl> 0.10667250, 0.23786250, 0.14444250, 0.11588125, 0.186…
## $ Dt.num...25 <dbl> 2007.784, 2008.243, 2008.779, 2009.153, 2009.452, 200…
## $ Dtt.num <dbl> 1192330200, 1206843900, 1223763420, 1235609580, 12450…
## $ HAND_NH4 <dbl> NA, 1.6579284, 1.1876348, 0.2446500, 0.1862600, 0.401…
## $ NH4 <dbl> 3.321632, 1.013287, 2.984873, 5.136503, 3.343649, 2.2…
## $ NO2 <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.1400100…
## $ NO3 <dbl> 0.8225237, 0.0000000, 0.2744196, 1.2644041, 0.6533917…
## $ NOx_PO4 <dbl> 0.233605384, Inf, 0.166195654, Inf, 0.207299638, 0.13…
## $ NOx <dbl> 0.8300237, 0.0100000, 0.2819196, 1.2675291, 0.7934017…
## $ PN_CHL <dbl> 50.81076, 70.97502, 43.20272, 32.14920, 40.25010, 31.…
## $ PN_PP <dbl> 5.457474, 4.841050, 5.852297, 5.035881, 5.801272, 4.0…
## $ PN_SHIM <dbl> 18.99411, 29.22387, 13.21873, 14.93224, 18.73589, 14.…
## $ PN_TSS <dbl> 0.017091541, 0.007517002, 0.008352101, 0.014976245, 0…
## $ PN <dbl> 9.556838, 17.876092, 13.877616, 9.648177, 13.335427, …
## $ PO4 <dbl> 3.2430236, 0.0000000, 8.4301889, 0.2818657, 3.3818466…
## $ POC_CHL <dbl> 643.2646, 594.1710, 320.9770, 300.8186, 334.7438, 274…
## $ POC_PN <dbl> 13.802976, 8.211349, 7.611680, 9.477119, 8.455543, 8.…
## $ POC_PP <dbl> 78.17430, 39.91906, 43.86293, 46.88621, 48.09479, 34.…
## $ POC_TSS <dbl> 0.28442285, 0.06276370, 0.06193767, 0.14314580, 0.093…
## $ POC <dbl> 126.47385, 147.34368, 103.58926, 89.97550, 110.71416,…
## $ PP_CHL <dbl> 9.073836, 13.775650, 7.352966, 6.429233, 6.769279, 7.…
## $ PP_TSS <dbl> 0.002787327, 0.001460411, 0.001423642, 0.003094262, 0…
## $ PP <dbl> 2.025051, 3.678694, 2.371837, 1.922521, 2.336996, 2.5…
## $ Salinity <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ SECCHI_DEPTH <dbl> 11.0, 4.0, 3.8, 6.0, 6.0, 11.0, 7.0, 3.5, 5.5, 6.5, 6…
## $ SI_NOx <dbl> 5345.75834, 32763.52291, 8925.66237, 5725.21940, 391.…
## $ SI_PO4 <dbl> 20.99143, Inf, 42.09525, Inf, 59.47313, 10.50605, 59.…
## $ SI <dbl> 70.47872, 327.63523, 119.92590, 196.88465, 160.96946,…
## $ TDN <dbl> 62.53592, 43.60394, 69.82460, 96.26414, 25.18013, 68.…
## $ TDP <dbl> 5.603170, 5.676724, 5.060421, 6.390311, 8.627777, 5.3…
## $ TN_TP <dbl> 9.567956, 6.958258, 11.240651, 13.392960, 3.586795, 1…
## $ TotalN <dbl> 72.09276, 61.48004, 83.70221, 105.91231, 38.51556, 78…
## $ TotalP <dbl> 7.628221, 9.355418, 7.432258, 8.312832, 10.964774, 7.…
## $ TSS_MGPERL <dbl> 1.052500, 3.120000, 1.695000, 0.761250, 1.505000, 1.4…
We will start by defining each of the categorical variables as factors. We will also define the random effect (reef.alias) as a factor.
wq = wq %>% mutate(reef.alias=factor(reef.alias),
Region=factor(Region),
Subregion=factor(Subregion),
Season=factor(Season))
Model formula: \[ y_i \sim{} \mathcal{N}(\mu_i, \sigma^2)\\ \mu_i =\beta_0 + f(Date_i) + f(Month_i) \]
where \(\beta_0\) is the y-intercept. \(f(Date)\) and \(f(Month)\) indicate the additive smoothing functions of the long-term temporal trends and the annual seasonal trends respectively.
If we begin with a quck scatterplot of NOx agains Date..
ggplot(wq, aes(y=NOx, x=Date)) + geom_point()
We know that these NOx values have been measured over time from a number of locations (reef.alias). Therefore, it would be good to facet the scatterplot conditional on the reef.alias.
ggplot(wq, aes(y=NOx, x=Date)) +
geom_point() +
facet_wrap(~reef.alias)
Conclusions:
ggplot(wq, aes(y=NOx, x=Date)) +
geom_point() +
geom_smooth() +
scale_y_continuous(trans=scales::pseudo_log_trans()) +
facet_wrap(~reef.alias, scales='free_y')
Conclusions:
Date data cannot be directly modelled, it must be converted into a numeric. This can be performed with the decimal_date() function within the lubridate package.
Although we generally want to scale any continuous predictors, it appears that doing so with Date objects has downstream issues - the models fit ok, but partial plots are displaced along the x-axis. So as an alternative either dont scale, or do so with a routine that does not automatically back-trasform (such as sjmisc::std or sjmisc::center).
wq=wq %>% mutate(Dt.num=decimal_date(Date))
#wq=wq %>% mutate(sDt=scale(Date))
When contemplating fitting a complex model, it is often good practice to build a model up, starting with something relatively simple. That way, if the model fails or yeilds strange outcomes, it is easier to diagnose the issues. In the spirit of this, we will begin by building up a temporal model for a single reef (Pandora), before moving on to generalise across multiple reefs in a mixed effects model.
wq.pandora=wq %>% filter(reef.alias=='Pandora', !is.na(NOx))
#wq.pandora=wq %>% filter(reef.alias=='Green', !is.na(NOx))
ggplot(wq.pandora, aes(y=NOx, x=Date)) + geom_point() +
geom_smooth() +
scale_y_log10()
Conclusions:
mgcv package (or any frequentist package for fitting GAM’s that I am aware of).wq.pandora=wq %>% filter(reef.alias=='Pandora', !is.na(NOx))
Actually,it might be worth exploring a variety of models:
wq.gam1 <- gam(NOx ~ s(Dt.num), data=wq.pandora, family='gaussian', method='REML')
wq.gam2 <- gam(NOx ~ s(Dt.num), data=wq.pandora, family=gaussian(link='log'), method='REML')
wq.gam3 <- gam(NOx ~ s(Dt.num), data=wq.pandora, family=Gamma(link='log'), method='REML')
wq.gam4 <- gam(NOx ~ s(Dt.num), data=wq.pandora, family=tw(link='log'), method='REML')
library(MuMIn)
AICc(wq.gam1, wq.gam2, wq.gam3, wq.gam4)
Conclusions:
k.check(wq.gam1)
## k' edf k-index p-value
## s(Dt.num) 9 2.149209 1.108005 0.6725
appraise(wq.gam1)
#performance::check_model(wq.gam1)
Conclusions:
wq.resid <- simulateResiduals(wq.gam1, plot=TRUE)
testTemporalAutocorrelation(wq.resid, time=wq.gam1$model$Dt.num)
##
## Durbin-Watson test
##
## data: simulationOutput$scaledResiduals ~ 1
## DW = 2.0394, p-value = 0.9078
## alternative hypothesis: true autocorrelation is not 0
Conclusions:
k.check(wq.gam2)
## k' edf k-index p-value
## s(Dt.num) 9 2.270725 1.101346 0.6975
appraise(wq.gam2)
#performance::check_model(wq.gam2)
Conclusions:
wq.resid <- simulateResiduals(wq.gam2, plot=TRUE)
testTemporalAutocorrelation(wq.resid, time=wq.gam2$model$Dt.num)
##
## Durbin-Watson test
##
## data: simulationOutput$scaledResiduals ~ 1
## DW = 2.0244, p-value = 0.9428
## alternative hypothesis: true autocorrelation is not 0
Conclusions:
k.check(wq.gam3)
## k' edf k-index p-value
## s(Dt.num) 9 5.076167 1.019582 0.645
appraise(wq.gam3)
#performance::check_model(wq.gam3)
Conclusions:
wq.resid <- simulateResiduals(wq.gam3, plot=TRUE)
testTemporalAutocorrelation(wq.resid, time=wq.gam3$model$Dt.num)
##
## Durbin-Watson test
##
## data: simulationOutput$scaledResiduals ~ 1
## DW = 2.1053, p-value = 0.7567
## alternative hypothesis: true autocorrelation is not 0
Conclusions:
k.check(wq.gam4)
## k' edf k-index p-value
## s(Dt.num) 9 5.088219 1.052492 0.7175
appraise(wq.gam4)
#performance::check_model(wq.gam4)
Conclusions:
#wq.resid <- simulateResiduals(wq.gam4, plot=TRUE)
#testTemporalAutocorrelation(wq.resid, time=wq.gam4$model$Dt.num)
Conclusions:
draw(wq.gam1)
plot(wq.gam1, pages=1, shift=coef(wq.gam1)[1], resid=TRUE, cex=4, scale=0)
draw(wq.gam2)
plot(wq.gam2, pages=1, shift=coef(wq.gam2)[1], resid=TRUE, trans=exp, cex=4, scale=0)
draw(wq.gam3)
plot(wq.gam3, pages=1, shift=coef(wq.gam3)[1], resid=TRUE, trans=exp, cex=4, scale=0)
draw(wq.gam4)
plot(wq.gam4, pages=1, shift=coef(wq.gam4)[1], resid=TRUE, trans=exp, cex=4, scale=0)
summary(wq.gam1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## NOx ~ s(Dt.num)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.905 0.290 6.569 2.52e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Dt.num) 2.149 2.682 1.532 0.217
##
## R-sq.(adj) = 0.0979 Deviance explained = 15.7%
## -REML = 66.635 Scale est. = 2.8596 n = 34
Conclusions:
tidy(wq.gam1)
summary(wq.gam2)
##
## Family: gaussian
## Link function: log
##
## Formula:
## NOx ~ s(Dt.num)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6433 0.1620 3.972 4e-04 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Dt.num) 2.271 2.866 0.677 0.561
##
## R-sq.(adj) = 0.0926 Deviance explained = 15.4%
## -REML = 68.172 Scale est. = 2.8816 n = 34
Conclusions:
tidy(wq.gam2)
summary(wq.gam3)
##
## Family: Gamma
## Link function: log
##
## Formula:
## NOx ~ s(Dt.num)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1664 0.1520 1.095 0.283
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Dt.num) 5.076 6.178 15.48 8.14e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.18 Deviance explained = 55.7%
## -REML = 47.788 Scale est. = 0.78522 n = 34
Conclusions:
tidy(wq.gam3)
summary(wq.gam4)
##
## Family: Tweedie(p=1.923)
## Link function: log
##
## Formula:
## NOx ~ s(Dt.num)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1750 0.1503 1.165 0.254
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Dt.num) 5.088 6.197 12.07 1.63e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.186 Deviance explained = 54.6%
## -REML = 47.665 Scale est. = 0.77308 n = 34
Conclusions:
tidy(wq.gam4)
It is likely that NOx levels vary throughout a year. This might be related to river runoff etc. By not accounting for this, the unexplained variation is relatively high and the power associated with the long-term trends will be relatively low.
We have two options for accounting for these intra-annual fluctuations:
wq.gam5 <- gam(NOx ~ s(Dt.num, by=Season),
data=wq.pandora,
family=Gamma(link='log'), method='REML')
k.check(wq.gam5)
## k' edf k-index p-value
## s(Dt.num):SeasonDry 9 4.129893 0.8994827 0.405
## s(Dt.num):SeasonWet 9 3.776405 0.8994827 0.335
appraise(wq.gam5)
#performance::check_model(wq.gam5)
Conclusions:
wq.resid <- simulateResiduals(wq.gam5, plot=TRUE)
testTemporalAutocorrelation(wq.resid, time=wq.gam5$model$Dt.num)
##
## Durbin-Watson test
##
## data: simulationOutput$scaledResiduals ~ 1
## DW = 1.9284, p-value = 0.8331
## alternative hypothesis: true autocorrelation is not 0
Conclusions:
draw(wq.gam5)
plot(wq.gam5, pages=1, shift=coef(wq.gam5)[1], resid=TRUE, trans=exp, cex=4, scale=0)
Cyclical smoothers force the start and end of the trend to line up. As such, they assume that the bounds of the data represent the extremes of the cycle. The AIMS water quality marine monitoring program does not visit every site in every month. Importantly, sites are rarely visited in either December or January. Of course, December and January are the bookends of a year.
To ensure that the cyclical smoother extends from December to January, we need to define the extent of the month codes. When doing so, we need to assign a number of knots and this must match the knots argument in the smoother (in this case k=5). A k of 5 was chosen arbitrarily to allow some wiggliness without going crazy.
wq.gam6 <- gam(NOx ~ s(Dt.num)+
s(Mnth,bs='cc',k=5,fx=F),
knots=list(Mnth=seq(1,12,length=5)),
data=wq.pandora,
family=Gamma(link='log'), method='REML')
k.check(wq.gam6)
## k' edf k-index p-value
## s(Dt.num) 9 5.566131 1.0524671 0.6500
## s(Mnth) 3 1.658295 0.7408556 0.1025
appraise(wq.gam6)
#performance::check_model(wq.gam6)
Conclusions:
wq.resid <- simulateResiduals(wq.gam6, plot=TRUE)
testTemporalAutocorrelation(wq.resid, time=wq.gam5$model$Dt.num)
##
## Durbin-Watson test
##
## data: simulationOutput$scaledResiduals ~ 1
## DW = 2.0885, p-value = 0.7945
## alternative hypothesis: true autocorrelation is not 0
Conclusions:
draw(wq.gam6)
plot(wq.gam6, pages=1, shift=coef(wq.gam5)[1], resid=TRUE, trans=exp, cex=4, scale=0)
AIC(wq.gam5, wq.gam6)
Conclusions:
summary(wq.gam6)
##
## Family: Gamma
## Link function: log
##
## Formula:
## NOx ~ s(Dt.num) + s(Mnth, bs = "cc", k = 5, fx = F)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.07122 0.13274 0.537 0.596
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Dt.num) 5.566 6.704 20.320 <2e-16 ***
## s(Mnth) 1.658 3.000 2.455 0.018 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.325 Deviance explained = 66.7%
## -REML = 46.3 Scale est. = 0.59909 n = 34
Conclusions:
wq.list = with(wq.pandora, list(Dt.num = seq(min(Dt.num), max(Dt.num), len=100)))
newdata = emmeans(wq.gam6, ~Dt.num, at=wq.list, type='response') %>% as.data.frame
head(newdata)
g1=ggplot(newdata, aes(y=response, x=date_decimal(Dt.num))) +
geom_ribbon(aes(ymin=lower.CL, ymax=upper.CL), fill='blue', alpha=0.3) +
geom_line() +
scale_x_datetime('') +
theme_bw()
wq.list = with(wq.pandora, list(Mnth = seq(1, 12, len=100)))
newdata1 = emmeans(wq.gam6, ~Mnth, at=wq.list, type='response') %>% as.data.frame
head(newdata1)
g2=ggplot(newdata1, aes(y=response, x=Mnth)) +
geom_ribbon(aes(ymin=lower.CL, ymax=upper.CL), fill='blue', alpha=0.3) +
geom_line() +
theme_bw()
#library(gridExtra)
#grid.arrange(g1, g2, nrow=1)
g1 + g2
## Partial residuals
wq.presid = data.frame(Dt.num=wq.gam6$model$Dt.num, Mnth=mean(wq.gam6$model$Mnth)) %>%
mutate(Pred = predict(wq.gam6, newdata=., type='link'),
Resid = wq.gam6$residuals,
Presid = exp(Pred + Resid))
head(wq.presid)
ggplot(newdata, aes(y=response, x=date_decimal(Dt.num))) +
geom_ribbon(aes(ymin=lower.CL, ymax=upper.CL), fill='blue', alpha=0.3) +
geom_line() +
scale_x_datetime('') +
theme_bw() +
geom_point(data=wq.presid, aes(y=Presid)) +
geom_point(data=wq.pandora, aes(y=NOx), color='red', alpha=0.5)
wq.presid=with(wq.gam6$model, data.frame(Dt.num=Dt.num, Mnth=mean(Mnth))) %>%
mutate(Resid=exp(as.vector(predict(wq.gam6, newdata=., type='link')) + wq.gam6$residuals))
ggplot(newdata, aes(y=response, x=date_decimal(Dt.num))) +
geom_ribbon(aes(ymin=lower.CL, ymax=upper.CL), fill='blue', alpha=0.3) +
geom_line() +
geom_point(data=wq.presid, aes(y=Resid)) +
scale_x_datetime('') +
theme_bw()
wq.sub <- wq %>%
group_by(reef.alias) %>%
mutate(Min=min(Dt.num)) %>%
ungroup() %>%
filter(Min<2012) %>%
droplevels
## reef=wq %>%
## group_by(reef.alias) %>%
## dplyr:::summarise(Min=min(Dt.num)) %>%
## filter(Min<2012) %>%
## pull(reef.alias)
## reef
## wq2=wq %>% filter(reef.alias %in% reef) %>% droplevels
ggplot(wq.sub, aes(y=NOx,x=Dt.num)) +
geom_point() +
facet_wrap(~reef.alias)
There are numerous ways of fitting mixed effects GAMs”
gamm() function and including a random formula. This will run the glmmPQL function behind the scenes. As such, it is not genuine likelihood and is therefore limitedgamm4 function from the package with the same name. This will run the glmer() behind the scenes.gam() function and including a random effects smootherggplot(wq, aes(y=NOx,x=Dt.num)) +
geom_point() +
facet_wrap(~reef.alias, scales='free_y')
## Some reefs dont have the full time series
wq.gamm1 <- gamm(NOx ~ s(Dt.num), random=list(reef.alias=~1),
data=wq, family=Gamma(link='log'), method='REML')
##
## Maximum number of PQL iterations: 20
k.check(wq.gamm1$gam)
## k' edf k-index p-value
## s(Dt.num) 9 7.366408 0.7185251 0
appraise(wq.gamm1$gam)
Conclusions:
#wq.resids <- simulateResiduals(wq.gamm1$gam, plot=TRUE)
Conclusions:
draw(wq.gamm1$gam)
plot(wq.gamm1$gam, pages=1, shift=fixef(wq.gamm1$lme)[1], resid=FALSE, trans=exp, cex=4, scale=0)
wq.gamm2 <- gamm4::gamm4(NOx ~ s(Dt.num), random=~(1|reef.alias),
data=wq, family=Gamma(link='log'), REML=TRUE)
k.check(wq.gamm2$gam)
## k' edf k-index p-value
## s(Dt.num) 9 8.061293 0.7166844 0
appraise(wq.gamm2$gam)
Conclusions:
#wq.resids <- simulateResiduals(wq.gamm2$gam, plot=TRUE)
Conclusions:
draw(wq.gamm2$gam)
plot(wq.gamm2$gam, pages=1, shift=fixef(wq.gamm2$mer)[1], resid=FALSE, trans=exp, cex=4, scale=0)
wq.gamm3 <- gam(NOx ~ s(Dt.num) + s(reef.alias, bs='re'),
data=wq, family=Gamma(link='log'), method='REML')
k.check(wq.gamm3)
## k' edf k-index p-value
## s(Dt.num) 9 8.090943 0.7049302 0
## s(reef.alias) 28 22.801475 NA NA
appraise(wq.gamm3)
Conclusions:
wq.resids <- simulateResiduals(wq.gamm3, plot=TRUE)
Conclusions:
draw(wq.gamm3)
plot(wq.gamm3, pages=1, shift=coef(wq.gamm3)[1], resid=FALSE, trans=exp, cex=4, scale=0)
wq.gamm3a <- gam(NOx ~ s(Dt.num, k=20) + s(reef.alias, bs='re'),
data=wq, family=Gamma(link='log'), method='REML')
k.check(wq.gamm3a)
## k' edf k-index p-value
## s(Dt.num) 19 14.94434 0.7543514 0
## s(reef.alias) 28 22.82759 NA NA
appraise(wq.gamm3a)
Conclusions:
wq.resids <- simulateResiduals(wq.gamm3a, plot=TRUE)
Conclusions:
draw(wq.gamm3a)
plot(wq.gamm3a, pages=1, shift=coef(wq.gamm3)[1], resid=FALSE, trans=exp, cex=4, scale=0)
wq.gamm3b <- gam(NOx ~ s(Dt.num, k=20)+
s(Mnth,bs='cc',k=5) +
s(reef.alias, bs='re'),
knots=list(Mnth=seq(1,12,length=5)),
family=Gamma(link='log'),
data=wq, method='REML')
k.check(wq.gamm3b)
## k' edf k-index p-value
## s(Dt.num) 19 14.941297 0.7622104 0.000
## s(Mnth) 3 2.027431 0.8520791 0.045
## s(reef.alias) 28 22.854663 NA NA
appraise(wq.gamm3b)
Conclusions:
the k-index and p-value (for both the long-term and seasonal trend) are low,
and the edf’s are approaching the respective k’ so it is likely that the number of knots is not overconstraining
the residual diagnostics are not too bad
wq.resids <- simulateResiduals(wq.gamm3b, plot=TRUE)
Conclusions:
draw(wq.gamm3b)
plot(wq.gamm3b, pages=1, shift=coef(wq.gamm3)[1], resid=FALSE, trans=exp, cex=4, scale=0)
## wq.gamm3c <- gam(NOx ~ s(Dt.num, k=20)+
## s(Mnth,bs='cc',k=5) +
## s(Region, bs='re') +
## s(reef.alias, bs='re'),
## knots=list(Mnth=seq(1,12,length=5)),
## family=Gamma(link='log'),
## data=wq, method='REML')
wq.gamm3c <- gam(NOx ~ s(Dt.num, by=Region, k=20)+
s(Mnth,bs='cc', by=Region, k=5) +
s(reef.alias, bs='re'),
knots=list(Mnth=seq(1,12,length=5)),
family=Gamma(link='log'),
data=wq, method='REML')
k.check(wq.gamm3c)
## k' edf k-index p-value
## s(Dt.num):RegionBurdekin 19 6.887115e+00 0.7588022 0.0000
## s(Dt.num):RegionMackay Whitsunday 19 4.235643e+00 0.7588022 0.0000
## s(Dt.num):RegionWet Tropics 19 8.888716e+00 0.7588022 0.0000
## s(Mnth):RegionBurdekin 3 1.541980e+00 0.8727190 0.1200
## s(Mnth):RegionMackay Whitsunday 3 2.322221e+00 0.8727190 0.1275
## s(Mnth):RegionWet Tropics 3 6.593186e-04 0.8727190 0.1125
## s(reef.alias) 28 2.241898e+01 NA NA
appraise(wq.gamm3c)
Conclusions:
wq.resids <- simulateResiduals(wq.gamm3c, plot=TRUE)
Conclusions:
draw(wq.gamm3c)
plot(wq.gamm3c, pages=1, shift=coef(wq.gamm3)[1], resid=FALSE, trans=exp, cex=4, scale=0)
Although the model that included smoothers conditional on Regions still had some outstanding issues with the DHARMa residuals (deviating from linear Q-Q and some non-linear pattern in the residuals), the model out-performs the other models and the violations are not too severe. It is highly likely that the model fits well and will yeild robust estimates.
summary(wq.gamm3c)
##
## Family: Gamma
## Link function: log
##
## Formula:
## NOx ~ s(Dt.num, by = Region, k = 20) + s(Mnth, bs = "cc", by = Region,
## k = 5) + s(reef.alias, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1199 0.1464 0.819 0.413
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Dt.num):RegionBurdekin 6.887e+00 8.496 11.925 < 2e-16 ***
## s(Dt.num):RegionMackay Whitsunday 4.236e+00 5.263 3.620 0.00272 **
## s(Dt.num):RegionWet Tropics 8.889e+00 10.930 18.653 < 2e-16 ***
## s(Mnth):RegionBurdekin 1.542e+00 3.000 1.099 0.11036
## s(Mnth):RegionMackay Whitsunday 2.322e+00 3.000 9.877 6.07e-07 ***
## s(Mnth):RegionWet Tropics 6.593e-04 3.000 0.000 0.70889
## s(reef.alias) 2.242e+01 27.000 5.265 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.243 Deviance explained = 47.6%
## -REML = 684.18 Scale est. = 1.1562 n = 568
Conclusions:
tidy(wq.gamm3c)
tidy(wq.gamm3c) %>% knitr::kable()
| term | edf | ref.df | statistic | p.value |
|---|---|---|---|---|
| s(Dt.num):RegionBurdekin | 6.8871153 | 8.496486 | 11.9254666 | 0.0000000 |
| s(Dt.num):RegionMackay Whitsunday | 4.2356428 | 5.262693 | 3.6200143 | 0.0027182 |
| s(Dt.num):RegionWet Tropics | 8.8887157 | 10.929975 | 18.6529647 | 0.0000000 |
| s(Mnth):RegionBurdekin | 1.5419803 | 3.000000 | 1.0991862 | 0.1103558 |
| s(Mnth):RegionMackay Whitsunday | 2.3222205 | 3.000000 | 9.8769822 | 0.0000006 |
| s(Mnth):RegionWet Tropics | 0.0006593 | 3.000000 | 0.0000766 | 0.7088895 |
| s(reef.alias) | 22.4189840 | 27.000000 | 5.2647762 | 0.0000000 |
The partial plots within each Region suggested that NOx concentrations peaked around 2014 before declining. We might be interested in describing the extent of this decline (between 2014 and 2016).
Marginalising over Regions, this is:
emmeans(wq.gamm3c, ~Dt.num, at=list(Dt.num=c(2016, 2014)),
type='response') %>%
pairs() %>%
confint()
Conclusions:
Alternatively, we could exlore the same declines for each Region.
emmeans(wq.gamm3c, ~Dt.num|Region, at=list(Dt.num=c(2016, 2014)),
type='response') %>%
pairs() %>%
confint
Conclusions:
Perhaps we will start by marginalising over Regions
wq.list <- with(wq, list(Dt.num = modelr::seq_range(Dt.num, n=100),
Region=levels(Region)))
newdata <- wq.gamm3c %>%
emmeans(~Dt.num, at=wq.list,
type='response',
#data=wq.gamm3c$model
) %>%
as.data.frame
head(newdata)
ggplot(newdata, aes(y=response, x=date_decimal(Dt.num))) +
geom_ribbon(aes(ymin=lower.CL, ymax=upper.CL), fill='blue', alpha=0.3) +
geom_line() +
scale_x_datetime('') +
theme_bw()
## Partial residuals
wq.presid <- with(wq.gamm3c$model,
data.frame(Dt.num, Mnth)) %>%
mutate(Pred = predict(wq.gamm3c, exclude='s(reef.alias)', type='link'),
Resid = wq.gamm3c$resid,
Partial.obs = exp(Pred + Resid))
Resid=exp(
as.vector(predict(wq.gamm3c, exclude='s(reef.alias)', type='link')) +
wq.gamm3c$residuals
)
head(wq.presid)
ggplot(newdata, aes(y=response, x=date_decimal(Dt.num))) +
geom_ribbon(aes(ymin=lower.CL, ymax=upper.CL), fill='blue', alpha=0.3) +
geom_line() +
geom_point(data=wq.presid, aes(y=Partial.obs)) +
geom_point(data=wq, aes(y=NOx, x=date_decimal(Dt.num)), color='red') +
scale_x_datetime('') +
scale_y_continuous(breaks=seq(0,10,by=2), limits=c(0,10))+
theme_bw()
Now conditional on Regions.
wq.list = with(wq, list(Dt.num = seq(min(Dt.num), max(Dt.num), len=100),
Region=levels(Region)))
newdata = emmeans(wq.gamm3c, ~Dt.num|Region, at=wq.list, type='response',
data=wq.gamm3c$model) %>% as.data.frame
head(newdata)
ggplot(newdata, aes(y=response, x=date_decimal(Dt.num))) +
geom_ribbon(aes(ymin=lower.CL, ymax=upper.CL), fill='blue', alpha=0.3) +
geom_line() +
scale_x_datetime('') +
facet_wrap(~Region, nrow=1) +
theme_bw()